Plotting for metric learning output
library(tidyverse)
library(dimRed)
library(reticulate)
library(here)
library(viridis)
library(hdrcde)
library(igraph)
library(matrixcalc)
library(akima)
library(car)
library(ggforce)
library(ks)
Jmisc::sourceAll(here::here("R/sources"))
set.seed(123)Metric learning
Smart meter data for one household
# load(here::here("data/spdemand_3639id336tow.rda"))
# nid <- 1
# ntow <- 336
#
# train <- spdemand %>%
# lazy_dt() %>%
# filter(tow <= ntow,
# # id <= sort(unique(spdemand[,id]))[nid],
# id == 1003) %>%
# dplyr::select(-id, -tow) %>%
# data.table::as.data.table()
train <- readRDS(here::here("data/spdemand_1id336tow_train.rds"))Radius searching with k-d trees
# Parameters fixed
x <- train
N <- nrow(train)
s <- 2
k <- 20
method <- "annIsomap"
annmethod <- "kdtree"
distance <- "euclidean"
treetype <- "kd"
searchtype <- "radius" # change searchtype for radius search based on `radius`, or KNN search based on `k`
radius <- .4 # the bandwidth parameter, \sqrt(\elsilon), as in algorithmThe metric learning algorithm
metric_isomap <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, invert.h = TRUE)
summary(metric_isomap)## Length Class Mode
## embedding 672 -none- numeric
## rmetric 1344 -none- numeric
## weighted_graph 10 igraph list
## adj_matrix 112896 dgCMatrix S4
## laplacian 112896 dgCMatrix S4
The function metricML() returns a list of
- the embedding coordinates
embeddingof \(N \times s\) - the embedding metric
rmetricfor each point \(p \in x\) as an array. By default,invert.h = TRUEreturns the inverse of Riemannian metric. - the weighted neighborhood graph as an
igraphobjectweighted_graph - the sparse adjacency matrix from the graph
adj_matrix - the graph laplacian matrix
laplacian
Embedding plot
# fn <- metric_isomap$embedding
# head(fn)
# plot(fn, col = viridis::viridis(24), asp = 1)
plot_embedding(metric_isomap) +
labs(x = "ISO1", y = "ISO2")Ellipse plot
Using the Riemannian metric for each point as the 2-d covariance matrix and the 2-d embeddings as the centroid to plot an ellipse for each point.
The underlying requirement is that the Riemannian metric needs to be a square positive definite matrix.
plot_ellipse(metric_isomap, add = F, n.plot = 50, scale = 20,
color = blues9[5], fill = blues9[5], alpha = 0.2)# fn <- metric_isomap$embedding
# Rn <- metric_isomap$rmetric # array
# n.plot <- 50
# e <- riem2ellipse(Rn, scale = 20) %>%
# cbind(fn) %>%
# as_tibble()
# OR
# plot_embedding(metric_isomap, embedding = F) +
# plot_ellipse(metric_isomap, add = T)
# OR
# # plot_embedding(fn, embedding = T) +
# ggplot(data = e, aes(E1, E2)) +
# geom_point() +
# geom_ellipse(data = slice_sample(e, n = n.plot),
# aes(x0 = E1, y0 = E2, a = a, b = b, angle = angle, group = -1),
# color = blues9[5], fill = blues9[5], alpha = 0.2)Variable kernel density estimate
For multivariate data, the variable kernel density estimate is given by \[\hat{f}(x) = n^{-1} \sum_i K_{H_i} (x - X_i).\]
Outlier plot based on density estimates
Fixed bandwidth
# fixed bandwidth
fn <- metric_isomap$embedding
E1 <- fn[,1] # rename as Ed to match the aesthetics in plot_ellipse()
E2 <- fn[,2]
prob <- c(1, 50, 99)
p_outlier <- hdrscatterplot(E1, E2, levels = prob, noutliers = 20, label = NULL) +
plot_ellipse(metric_isomap, add = T, n.plot = 50, scale = 20,
color = blues9[5], fill = blues9[5], alpha = 0.2)
p_outlierPointwise adaptive bandwidth
Rn <- metric_isomap$rmetric # array
n.grid <- 10
f <- vkde2d(x = fn[,1], y = fn[,2], h = Rn, n = n.grid)
plot_contour(metric_isomap, n.grid = n.grid, scale = 1)source(here::here("R/sources/hdrplotting.R"))
p_outlier_vkde <- plot_outlier(x = metric_isomap, n.grid = 50, prob = prob)
# den <- f
# x <- E1; y <- E2
# <- = c(50, 95, 99)
# # prob = (1:10)*10
# # Convert prob to coverage percentage if necessary
# if(max(prob) > 50) {# Assume prob is coverage percentage
# alpha <- (100-prob)/100
# } else {# prob is tail probability (for backward compatibility)
# alpha <- prob}
# alpha <- sort(alpha)
# # Calculates falpha needed to compute HDR of bivariate density den.
# # Also finds approximate mode.
# fxy <- hdrcde:::interp.2d(den$x,den$y,den$z,x,y)
# falpha <- quantile(fxy, alpha)
# index <- which.max(fxy)
# mode <- c(x[index],y[index])
# hdr2d_info <- structure(list(mode=mode,falpha=falpha,fxy=fxy, den=den, alpha=alpha, x=x, y=y), class="hdr2d") # list for hdr.2d() output
# # plot.hdr2d(hdr2d_info, show.points = T, outside.points = T, pointcol = grey(0.5), xlim = round(range(x)), ylim = round(range(y)))
#
# p_outlier_vkde <- hdrscatterplot(E1, E2, levels = prob, noutliers = 20, label = NULL, den = hdr2d_info) +
# plot_ellipse(metric_isomap, add = T, n.plot = 50, scale = 20,
# color = blues9[5], fill = blues9[5], alpha = 0.2)library(patchwork)
(p_outlier + p_outlier_vkde ) + coord_fixed()t-SNE
x <- train
method <- "tSNE"
perplexity <- 30
theta <- 0.5
metric_tsne <- metricML(x, s = s, k = k, radius = radius, method = method, annmethod = annmethod, eps = 0, distance = distance, treetype = treetype, searchtype = searchtype, perplexity = perplexity, theta = theta, invert.h = TRUE)
summary(metric_tsne)## Length Class Mode
## embedding 672 -none- numeric
## rmetric 1344 -none- numeric
## weighted_graph 10 igraph list
## adj_matrix 112896 dgCMatrix S4
## laplacian 112896 dgCMatrix S4
ftsne <- metric_tsne$embedding
E1 <- ftsne[,1]; E2 <- ftsne[,2]
# plot_embedding(metric_tsne)
plot_ellipse(metric_tsne, n.plot = 50)plot_contour(metric_tsne, n.grid = 20, scale = 1/8)p_tsne <- plot_outlier(x = metric_tsne, n.grid = 20, prob = prob, noutliers = 20, scale = 1/8)
p_tsnehdrscatterplot(E1, E2, kde.package = "ks", noutliers = 20) +
plot_ellipse(metric_tsne, n.plot = 50, add = T)p_outlier_vkde + p_tsne